import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import datetime
import ipywidgets as widgets
from IPython.display import display, Javascript
The data was in a parquet file, so we read it in a dataframe and converted it to a csv format
parquet_file = 'https://github.com/pegasus-99/NYC_yellowcab_data_visualization/blob/main/yellow_tripdata.parquet?raw=true'
df = pd.read_parquet(parquet_file, engine='auto')
df.to_csv('yellow_tripdata.csv')
df.head()
| VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | airport_fee | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2022-08-01 00:17:39 | 2022-08-01 00:19:58 | 1.0 | 0.4 | 1.0 | N | 114 | 148 | 1 | 3.5 | 3.0 | 0.5 | 1.80 | 0.00 | 0.3 | 9.10 | 2.5 | 0.0 |
| 1 | 1 | 2022-08-01 00:26:06 | 2022-08-01 00:31:55 | 1.0 | 1.4 | 1.0 | N | 79 | 137 | 1 | 6.5 | 3.0 | 0.5 | 2.05 | 0.00 | 0.3 | 12.35 | 2.5 | 0.0 |
| 2 | 1 | 2022-08-01 00:45:49 | 2022-08-01 00:59:29 | 1.0 | 5.2 | 1.0 | N | 79 | 74 | 1 | 16.0 | 3.0 | 0.5 | 3.95 | 0.00 | 0.3 | 23.75 | 2.5 | 0.0 |
| 3 | 1 | 2022-08-01 00:05:49 | 2022-08-01 00:25:42 | 1.0 | 9.4 | 1.0 | N | 138 | 113 | 1 | 28.0 | 3.0 | 0.5 | 7.65 | 6.55 | 0.3 | 46.00 | 2.5 | 0.0 |
| 4 | 1 | 2022-08-01 00:36:29 | 2022-08-01 00:51:29 | 1.0 | 1.7 | 1.0 | N | 137 | 68 | 1 | 11.0 | 3.0 | 0.5 | 1.00 | 0.00 | 0.3 | 15.80 | 2.5 | 0.0 |
# Converting pickup and dropoff columns from object to datetime
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
df.isnull().sum()
VendorID 0 tpep_pickup_datetime 0 tpep_dropoff_datetime 0 passenger_count 93174 trip_distance 0 RatecodeID 93174 store_and_fwd_flag 93174 PULocationID 0 DOLocationID 0 payment_type 0 fare_amount 0 extra 0 mta_tax 0 tip_amount 0 tolls_amount 0 improvement_surcharge 0 total_amount 0 congestion_surcharge 93174 airport_fee 93174 dtype: int64
Out of 3.1 million rows, only around 93,000 of them seem to have null values. Which is not a very significant amount
# Creating columns for pickup date and pickup time from the datetime format, same for dropoff
df['pickup_date'] = [d.date() for d in df['tpep_pickup_datetime']]
df['pickup_time'] = [d.time() for d in df['tpep_pickup_datetime']]
df['dropoff_date'] = [d.date() for d in df['tpep_dropoff_datetime']]
df['dropoff_time'] = [d.time() for d in df['tpep_dropoff_datetime']]
df.dtypes
VendorID int64 tpep_pickup_datetime datetime64[ns] tpep_dropoff_datetime datetime64[ns] passenger_count float64 trip_distance float64 RatecodeID float64 store_and_fwd_flag object PULocationID int64 DOLocationID int64 payment_type int64 fare_amount float64 extra float64 mta_tax float64 tip_amount float64 tolls_amount float64 improvement_surcharge float64 total_amount float64 congestion_surcharge float64 airport_fee float64 pickup_date object pickup_time object dropoff_date object dropoff_time object dtype: object
# Converting pickup date and dropoff date into datetime format
df['pickup_date'] = pd.to_datetime(df['pickup_date'])
df['dropoff_date'] = pd.to_datetime(df['dropoff_date'])
# Grouping by pickup date and location and getting the sum of all the values relating to the fare
df_group_date = df[['fare_amount', 'extra',
'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
'total_amount','pickup_date','PULocationID']].groupby(['pickup_date', 'PULocationID']).sum()
# Resetting the index to get values of the sum for each location and date
df_group_date.reset_index(level=[0,1], inplace=True)
Next I created a dates and locations variable which had a list of all the unique pickup dates and pickup locations respectively
dates = list(df['pickup_date'].unique())
locations = list(df['PULocationID'].unique())
Here we created an interactive function that allows the user to pick the date and location to view the total amount of money earned from pickups in the chosen area at the chosen date. There is also a breakdown of the total cost into its many components
@widgets.interact(pickup_date=dates, PULocationID = locations)
def date_location_select(pickup_date = 'pickup_date', PULocationID ='PULocationID'):
z = df_group_date[(df_group_date['pickup_date']==pickup_date) & (df_group_date['PULocationID'] == PULocationID)]
z = z.drop(['pickup_date', 'PULocationID'], axis=1)
if z.empty:
return False
ax = z.T.plot(kind="bar", rot=0, title='Breakdown of toal amount paid by day and location', figsize=(10,6))
ax.set_xticklabels(list(z.columns), rotation = 90)
ax.set_xlabel("Fare breakdown")
ax.set_ylabel("Total cost for location")
ax.get_legend().remove()
data = df.copy()
# Checking statistial values of various columns
pd.set_option('display.float_format', lambda x: '%.4f' % x)
data.describe()
| VendorID | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | airport_fee | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3152677.0000 | 3059503.0000 | 3152677.0000 | 3059503.0000 | 3152677.0000 | 3152677.0000 | 3152677.0000 | 3152677.0000 | 3152677.0000 | 3152677.0000 | 3152677.0000 | 3152677.0000 | 3152677.0000 | 3152677.0000 | 3059503.0000 | 3059503.0000 |
| mean | 1.7249 | 1.4231 | 5.7163 | 1.4722 | 163.3933 | 160.9009 | 1.2096 | 14.8191 | 0.9886 | 0.4875 | 2.6663 | 0.5629 | 0.2956 | 21.5539 | 2.2564 | 0.1087 |
| std | 0.4730 | 0.9814 | 565.1028 | 6.1248 | 64.8108 | 70.1000 | 0.5158 | 14.8066 | 1.2382 | 0.0979 | 3.2889 | 2.0721 | 0.0509 | 18.2757 | 0.7893 | 0.3564 |
| min | 1.0000 | 0.0000 | 0.0000 | 1.0000 | 1.0000 | 1.0000 | 0.0000 | -900.0000 | -5.0000 | -0.5000 | -155.0000 | -54.0000 | -0.3000 | -895.3000 | -2.5000 | -1.2500 |
| 25% | 1.0000 | 1.0000 | 1.1300 | 1.0000 | 132.0000 | 112.0000 | 1.0000 | 7.0000 | 0.0000 | 0.5000 | 0.0000 | 0.0000 | 0.3000 | 12.3000 | 2.5000 | 0.0000 |
| 50% | 2.0000 | 1.0000 | 1.9300 | 1.0000 | 161.0000 | 162.0000 | 1.0000 | 10.0000 | 0.5000 | 0.5000 | 2.0600 | 0.0000 | 0.3000 | 15.8700 | 2.5000 | 0.0000 |
| 75% | 2.0000 | 2.0000 | 3.7000 | 1.0000 | 233.0000 | 233.0000 | 1.0000 | 16.5000 | 2.5000 | 0.5000 | 3.2600 | 0.0000 | 0.3000 | 22.8000 | 2.5000 | 0.0000 |
| max | 6.0000 | 9.0000 | 330578.6600 | 99.0000 | 265.0000 | 265.0000 | 5.0000 | 1136.5000 | 9.0500 | 3.3000 | 400.0000 | 139.4500 | 0.3000 | 1139.8000 | 2.7500 | 1.2500 |
# Converting columns to datetime
data['tpep_pickup_datetime']=pd.to_datetime(data['tpep_pickup_datetime'])
data['tpep_dropoff_datetime']=pd.to_datetime(data['tpep_dropoff_datetime'])
# Creating columns for day, weekday, hour and month for pickup and dropoff
data['pickup_day']=data['tpep_pickup_datetime'].dt.day_name()
data['dropoff_day']=data['tpep_dropoff_datetime'].dt.day_name()
data['pickup_weekday']=data['tpep_pickup_datetime'].dt.weekday
data['dropoff_weekday']=data['tpep_dropoff_datetime'].dt.weekday
data['pickup_hour']=data['tpep_pickup_datetime'].dt.hour
data['dropoff_hour']=data['tpep_dropoff_datetime'].dt.hour
data['pickup_month']=data['tpep_pickup_datetime'].dt.month
data['dropoff_month']=data['tpep_dropoff_datetime'].dt.month
# Creating a column to calculate the tip percent
data["tip_percent"] = data["tip_amount"] * 100 / (data["tip_amount"] + data["fare_amount"])
# Plotting average tip percent per day
plt.ticklabel_format(style='plain') # to prevent scientific notation.
n_by_pickup_percent = data.groupby("pickup_weekday")["tip_percent"].mean()
ax = n_by_pickup_percent.sort_values(ascending = False).plot.bar()
plt.title('Tip Percent Average per day')
plt.xlabel('Pickup Day of week')
plt.ylabel('Tip Percent')
plt.show()
# Average tip percent per hour of the day
plt.ticklabel_format(style='plain') # to prevent scientific notation.
n_by_pickup_percent = data.groupby("pickup_hour")["tip_percent"].mean()
ax = n_by_pickup_percent.sort_values(ascending = False).plot.bar()
plt.title('Tip Percent Average per hour')
plt.xlabel('Pickup Hour')
plt.ylabel('Tip Percent')
plt.show()
# Average tip percent for the top 40 dropoff locations
plt.ticklabel_format(style='plain') # to prevent scientific notation.
n_by_pickup_percent = data.groupby("DOLocationID")["tip_percent"].mean().sort_values(ascending = False).plot.bar()
#n_by_pickup_percent = n_by_pickup_percent.sort_values(ascending = False).plot.bar()
plt.title('Tip Percent Average per Drop off Location')
plt.xlabel('DOLocationID')
plt.ylabel('Tip Percent')
plt.xlim((1,40))
plt.show()
plt.figure(figsize=(18,6))
sns.histplot(data=data, x='pickup_day', hue='pickup_day', palette="husl")
plt.title("Which days see the most customers?")
plt.xlabel('Days of the week')
plt.ylabel('Pickup count')
plt.show()
From the above visualization, we can conclude that Tuesdays and Wednesdays are the busiest days of the week. Taxis are used the least on Sundays.
As a Taxi Driver, I will know that I am more likely to get more customers on Tuesdays and Wednesdays.
plt.figure(figsize=(18,6))
sns.countplot(data=data, x='pickup_hour', palette="husl")
plt.title("Which hours of the day see the most customers?")
plt.xlabel('Hours of the day')
plt.ylabel('Pickup count')
plt.show()
From this visualization, we can dedude that afternoon and evening hours see more customers. The busiest hours of the day are between 10 am to 9 pm.
data.VendorID.unique()
array([1, 2, 6, 5])
plt.figure(figsize=(18,6))
sns.countplot(data=data, x='pickup_day', hue='VendorID', palette="husl")
plt.title("Which Vendor are preferred on a given day?")
plt.xlabel('Vendors')
plt.ylabel('Pickup count')
plt.show()
On most days, Vendor 2 followed by Vendor 1 is preferred by Customers.
data.payment_type.unique()
array([1, 2, 3, 4, 5, 0])
plt.figure(figsize=(18,6))
sns.countplot(data=data, x='pickup_day', hue='payment_type', palette="husl")
plt.title("Which payment methods are preferred on a given day?")
plt.xlabel('Payment Methods used during the trips')
plt.ylabel('Pickup count')
plt.show()
On all days of the week, payment type 1 is preferred. While payment type 4 and 5 are rarely used.
plt.figure(figsize=(20,6))
sns.lineplot(data=data, x="pickup_hour", y="tip_amount", hue="pickup_day", style="pickup_day")
plt.title("Tip amount according to the hour and day of the week")
plt.xlabel('Hour of the day')
plt.ylabel('Tip Amount')
plt.legend(title = "Days of the Week")
plt.show()
From this visualization, we can conclude that higher tips are given after midnight especially at 12-1 am and 4-6am.
plt.figure(figsize=(20,6))
sns.lineplot(data=data, x="pickup_hour", y="fare_amount", hue="pickup_day")
#plt.ylim([1,3,5,7,9,11,13,15,17,19,21,23])
plt.title("Fare amount according to the hour and day of the week")
plt.xlabel('Hour of the day')
plt.ylabel('Fare Amount')
plt.legend(title = "Days of the Week")
plt.show()
From this visualization, we dedude that fares are higher after midnight. I'm guessing that taxis have a higher rate for night time.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x="pickup_day", y="tip_amount", hue="pickup_day", palette="husl")
plt.ylim(0,200)
plt.title("Which days see the most tips?")
plt.xlabel('Days of the week')
plt.ylabel('Tip Amount')
plt.legend(title = "Days of the Week")
plt.show()
/usr/local/lib/python3.8/dist-packages/IPython/core/pylabtools.py:128: UserWarning: Creating legend with loc="best" can be slow with large amounts of data. fig.canvas.print_figure(bytes_io, **kw)
This visualization needs to be worked on further. I can't accurately tell which days receive the most tips from the customers.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x="pickup_day", y="trip_distance", hue="pickup_day", palette="husl")
plt.ylim(0,350)
plt.title("Which days see the most distance covered by taxis?")
plt.xlabel('Days of the week')
plt.ylabel('Trip Distance')
plt.legend(title = "Days of the Week")
plt.show()
/usr/local/lib/python3.8/dist-packages/IPython/core/pylabtools.py:128: UserWarning: Creating legend with loc="best" can be slow with large amounts of data. fig.canvas.print_figure(bytes_io, **kw)
Similarly, I can't accurately tell which ones cover the most distance. Need to be worked upon.
criteria_rb = widgets.RadioButtons(
options=['extra', 'mta_tax', 'tolls_amount', 'airport_fee'],
description='Other fares:',
disabled=False)
def plot_extra(criteria):
plt.figure(figsize=(12, 6))
sns.barplot(x= 'pickup_day', y=criteria, data=data, palette="husl")
plt.title("Other fares that are charged across the week")
widgets.interact(plot_extra, criteria = criteria_rb)
<function __main__.plot_extra(criteria)>
There are some extra charges on top of the fare amount, such as - mta tax, tolls amount, airport fee etc. I wanted to visualize - which of these extra fees are charged more or less given a particular day of the week.
fig = px.scatter(
df,
x = 'fare_amount',
y = 'tip_amount',
)
fig.show()